Load packages

library(Seurat)
library(ggplot2)
library(patchwork)
library(knitr)
library(dplyr)

Load data

control_data <- Read10X(data.dir = 'GSE276251_RAW/control/')
control <- CreateSeuratObject(counts = control_data, project = 'control', min.cells = 3, min.features = 200)

high_glucose_data <- Read10X(data.dir = 'GSE276251_RAW/high_glucose/')
high_glucose <- CreateSeuratObject(counts = high_glucose_data, project = 'high_glucose', min.cells = 3, min.features = 200)

Combine CONTROL and HIGH GLUCOSE objects

# combine CONTROL and HIGH GLUCOSE
combined <- merge(control, y = high_glucose, add.cell.ids = c("Control", "HighGlucose"))
head(combined@meta.data, 5)
##                            orig.ident nCount_RNA nFeature_RNA
## Control_AAACCCAAGACCAGCA-1    control      20307         3155
## Control_AAACCCAAGCCTGGAA-1    control       9374         1963
## Control_AAACCCACACACCTAA-1    control      10715         2117
## Control_AAACCCACAGAACATA-1    control       6655         1959
## Control_AAACCCAGTGCCCAGT-1    control       5792         1793

1. Filter out low-quality cells

# calculate mitochondrial percentages
combined[["percent.mt"]] <- PercentageFeatureSet(combined, pattern = "^mt-")
head(combined@meta.data, 5)
##                            orig.ident nCount_RNA nFeature_RNA percent.mt
## Control_AAACCCAAGACCAGCA-1    control      20307         3155   3.210716
## Control_AAACCCAAGCCTGGAA-1    control       9374         1963   1.568167
## Control_AAACCCACACACCTAA-1    control      10715         2117   3.070462
## Control_AAACCCACAGAACATA-1    control       6655         1959   1.066867
## Control_AAACCCAGTGCCCAGT-1    control       5792         1793   5.231354
# create violin plots
VlnPlot(combined, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

# plot mitochondria percentage and feature counts against RNA counts
plot1 <- FeatureScatter(combined, feature1 = "nCount_RNA", feature2 = "percent.mt") + scale_y_continuous(breaks = seq(0, 100, by = 5)) + geom_point(alpha = 0.05)

plot2 <- FeatureScatter(combined, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") + scale_y_continuous(breaks = seq(0, 8500, by = 500)) + geom_point(alpha = 0.05)

plot1 + plot2

Filtration: (200 > nFeature_RNA > 7000) and (percent.mt < 10)

Discussion: Based on the plots, I decided to choose these filtration parameters. I applied the same filtration on CONTROL and HIGH GLUCOSE for comparability. High percentage of mitochonrial percentage indicate that the cell is stressed/dying. Therefore, filtering high mitochondrial percentage will ensure that DEGs observed would be due to biological variation rather than data variability.

# filter meta.data based on the conditions
combined <- subset(combined, subset = nFeature_RNA > 200 & nFeature_RNA < 7000 & percent.mt < 10)

2. Clustering Analysis

Discussion:

SCTransform, as mentioned in the paper by Choudhary et al., replaces the need to run NormalizeData, FindVariableFeatures, and ScaleData separately. It also tackles with confounding due to mitochondrial percentage during normalizatoin. However, I decided not to use SCTransform.

# normalize data to reduce the effect of highly expressed genes
combined <- NormalizeData(combined, normalization.method = "LogNormalize", scale.factor = 10000)
# calculate a subset of features that exhibit high cell-to-cell variation
combined <- FindVariableFeatures(combined, selection.method = "vst", nfeatures = 2000)
# scale the data for PCA
# scaling performs linear transformation to get mean of 0 and variation of 1
all_genes_combined <- rownames(combined)
combined <- ScaleData(combined, features = all_genes_combined)
# perform PCA
combined <- RunPCA(combined, features = VariableFeatures(object = combined))
# analyze elbow plot to find optimal PC dimension
ElbowPlot(combined, ndims=50) + ggtitle('Elbowplot of PC Dimension vs. Std Dev')

Discussion:

I chose 20 as the elbow point. Although 10 PCs is usually picked, but based on this elbowplot I chose 20 PCs, as up to this point we can expect to gain significant information.

Additionally, I used the clustering resolution at 0.4, which is crucial for accurate interpretation.

# perform clustering
uninteg <- FindNeighbors(combined, dims = 1:20, reduction = "pca")
uninteg <- FindClusters(uninteg, resolution = .4, cluster.name = "unintegrated_clusters")
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 11938
## Number of edges: 420191
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9388
## Number of communities: 22
## Elapsed time: 1 seconds
# perform UMAP for visualization
uninteg <- RunUMAP(uninteg, dims = 1:20, reduction = "pca", reduction.name = "umap.unintegrated")
## 02:19:06 UMAP embedding parameters a = 0.9922 b = 1.112
## 02:19:06 Read 11938 rows and found 20 numeric columns
## 02:19:06 Using Annoy for neighbor search, n_neighbors = 30
## 02:19:06 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 02:19:07 Writing NN index file to temp file /var/folders/1k/jj40mz3j3zj2w9gsgg7xpylm0000gn/T//RtmpWjRnAu/file9d8920272da2
## 02:19:07 Searching Annoy index using 1 thread, search_k = 3000
## 02:19:10 Annoy recall = 100%
## 02:19:10 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 02:19:10 Initializing from normalized Laplacian + noise (using RSpectra)
## 02:19:11 Commencing optimization for 200 epochs, with 519932 positive edges
## 02:19:15 Optimization finished
# visulize clusters
DimPlot(uninteg, reduction = "umap.unintegrated", label = TRUE) + ggtitle('Dimplot of clusters')

3. Integrate CONTROL and HIGH GLUCOSE datasets

# integrate the splits on CONTROL and HIGH GLUCOSE
integ <- IntegrateLayers(object = combined, method = CCAIntegration, orig.reduction = "pca", new.reduction = "integrated.cca",
    verbose = FALSE)
# join the integrated layers
integ[["RNA"]] <- JoinLayers(integ[["RNA"]])
# perform clustering
integ <- FindNeighbors(integ, reduction = "integrated.cca", dims = 1:20)
integ <- FindClusters(integ, resolution = .4)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 11938
## Number of edges: 427051
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9383
## Number of communities: 20
## Elapsed time: 1 seconds
# perform UMAP for visualization
integ <- RunUMAP(integ, dims = 1:20, reduction = "integrated.cca")
## 02:20:59 UMAP embedding parameters a = 0.9922 b = 1.112
## 02:20:59 Read 11938 rows and found 20 numeric columns
## 02:20:59 Using Annoy for neighbor search, n_neighbors = 30
## 02:20:59 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 02:21:00 Writing NN index file to temp file /var/folders/1k/jj40mz3j3zj2w9gsgg7xpylm0000gn/T//RtmpWjRnAu/file9d89549bf4b8
## 02:21:00 Searching Annoy index using 1 thread, search_k = 3000
## 02:21:02 Annoy recall = 100%
## 02:21:03 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 02:21:03 Initializing from normalized Laplacian + noise (using RSpectra)
## 02:21:04 Commencing optimization for 200 epochs, with 521170 positive edges
## 02:21:08 Optimization finished
# visualization
DimPlot(integ, reduction = "umap", label = TRUE) + ggtitle('Dimplot of clusters after integration')

4. Identify differentially expressed genes (DEGs)

Discussion:

I decided not to use pseudobulk approach using AggregateExpression. Rather I created my own object to contain all the DEGs for all exhaustive cluster-condition pairs.

Furthermore, I kept the clusters as (0,1,2,3,..) instead of naming them as mentioned.

# create a new column that combines cluster and group (control/high glucose) information
integ$cond_cluster <- paste(Idents(integ), ifelse(integ$orig.ident == "control", "Con", "HG"), sep = '_')
Idents(integ) <- integ$cond_cluster
# visualize CONTROL and HIGH GLUCOSE separately
DimPlot(integ, reduction='umap', label = TRUE, split.by = 'orig.ident') + ggtitle('Dimplot of clusters by control(Con)  and high glucose(HG) group')

# check the indentifiers
table(Idents(integ))
## 
##  2_Con  5_Con  1_Con 10_Con 15_Con  0_Con  9_Con  4_Con 11_Con 16_Con  3_Con 
##    862    395   1403    224    163   2124    202    494    173    113    573 
## 13_Con 14_Con 17_Con 12_Con  7_Con  8_Con  6_Con 19_Con 18_Con   2_HG   1_HG 
##    153    155     35    227    288    281    270     25     38    317    281 
##  10_HG   0_HG   5_HG   9_HG   3_HG  13_HG   4_HG  18_HG  14_HG   7_HG  11_HG 
##    108    994    184    156    404     91    300     26     65    186    133 
##   8_HG  19_HG  17_HG   6_HG  16_HG  12_HG  15_HG 
##     79     21     64    248     38     20     25
# List for contating DEGs for each cluster groups
top_DEGs_by_cluster_condition <- list()
# get unique clusters and conditions
clusters <- unique(integ$seurat_clusters)
conditions <- unique(integ$orig.ident)

Discussion:

I only did control_# vs high_glucose_# since the absolute avg_log2FC values are the same if they are flipped i.e. if the control and high_glucose groups are flipped in FindMarkers, the signs of avg_log2FC values get flipped. Therefore, I decided not to include both directions for simplicity.

In my analysis I am calculating the 10 most differentially expressed genes (DEGs) regardless of direction (positive or negative). The values are sorted based on the absolute values of avg_log2FC, however I am keeping their signed value for relaying the mutual direction.

# calculate DEGs for cluster-condition pairs
for (cluster in clusters) {
      # create identifiers for the cluster groups
      ident.1 <- paste(cluster, 'Con', sep = "_")
      ident.2 <- paste(cluster, 'HG', sep = "_")

      # find markers between the two conditions in the same cluster
      markers <- FindMarkers(integ, ident.1 = ident.1, ident.2 = ident.2)
      # get top genes based on highest absolute avg_log2FC
      top_genes <- head(markers[order(-abs(markers$avg_log2FC)), ], n = 10) 
      top_DEGs_by_cluster_condition[[paste("Cluster", cluster, "CONTROL", "and GLUCOSE")]] <- top_genes
}